# Load the FY2023 property assessment data from the CSV file.
property <- read.csv(file = "fy2023-property-assessment-data.csv")
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(gmodels)
library(reshape2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Standardise the column names with janitor::clean_names() so the columns can
# be referred to by the names used below.
property <- property %>% clean_names()

# Keep only the eight columns used in the rest of the analysis.
property_extract <- property[, c(
  "city", "land_sf", "land_value", "bldg_value",
  "roof_structure", "bed_rms", "full_bth", "heat_type"
)]

# Inspect the structure (column types and first values) of the extract.
str(property_extract)
## 'data.frame': 180627 obs. of 8 variables:
## $ city : chr "EAST BOSTON" "EAST BOSTON" "EAST BOSTON" "EAST BOSTON" ...
## $ land_sf : int 1150 1150 1150 1150 2010 2500 2500 2500 2500 2500 ...
## $ land_value : int 195700 196500 197100 197700 225700 261200 262000 262700 263200 224400 ...
## $ bldg_value : int 588500 615300 599200 530200 569100 1027100 993200 876500 613100 802800 ...
## $ roof_structure: chr "F - Flat" "F - Flat" "F - Flat" "M - Mansard" ...
## $ bed_rms : int 6 3 5 5 6 13 14 11 5 6 ...
## $ full_bth : int 3 3 3 3 3 6 5 3 3 3 ...
## $ heat_type : chr "W - Ht Water/Steam" "F - Forced Hot Air" "S - Space Heat" "W - Ht Water/Steam" ...
This summarizes the data types for each column in the dataset. Since each column already has an appropriate data type, we don't need to change any of them.
# Print per-column summary statistics for the extract.
property_extract %>% summary()
## city land_sf land_value bldg_value
## Length:180627 Min. : 100 Min. : 0 Min. :0.000e+00
## Class :character 1st Qu.: 1000 1st Qu.: 0 1st Qu.:2.987e+05
## Mode :character Median : 2014 Median : 0 Median :4.892e+05
## Mean : 7816 Mean : 376579 Mean :1.121e+06
## 3rd Qu.: 4770 3rd Qu.: 225000 3rd Qu.:7.449e+05
## Max. :101513565 Max. :486046900 Max. :1.723e+09
## NA's :7545
## roof_structure bed_rms full_bth heat_type
## Length:180627 Min. : 0.00 Min. : 0.000 Length:180627
## Class :character 1st Qu.: 2.00 1st Qu.: 1.000 Class :character
## Mode :character Median : 3.00 Median : 1.000 Mode :character
## Mean : 3.16 Mean : 1.351
## 3rd Qu.: 4.00 3rd Qu.: 2.000
## Max. :17.00 Max. :17.000
## NA's :48287 NA's :11116
# Number of parcels recorded for each city, as a plain data frame.
V1 <- property_extract %>%
  count(city, name = "total") %>%
  as.data.frame()

# Rows whose city is the empty string have no usable label: mark them as NA
# and drop them.
V1$city <- na_if(V1$city, "")
V1 <- na.omit(V1)
I have used ifelse() and na.omit() to
remove the row with an empty city name from the data frame so that the graph
is plotted accurately. This leaves 19 cities out of the original 20 rows.
# Horizontal bar chart of parcel counts, with cities ordered by their count.
ggplot(V1, aes(x = total, y = reorder(city, total), fill = city)) +
  geom_col(show.legend = FALSE) +
  ggtitle("Total parcels in each city") +
  xlab("Total") +
  ylab("City")
The graph above, plotted using ggplot, depicts the total number of parcels
in each city. According to the chart, Boston has the highest number of
parcels (47,104) and Newton has the lowest number of parcels (1).
There are a total of 19 cities in the dataset.
# Mean building value per city, sorted from the highest average to the lowest.
V2 <- property_extract %>%
  group_by(city) %>%
  summarise(average_value = mean(bldg_value)) %>%
  as.data.frame() %>%
  arrange(desc(average_value))

# Tabulate the city labels to check for unexpected values.
table(V2$city)
##
## ALLSTON BOSTON BRIGHTON
## 1 1 1 1
## BROOKLINE CHARLESTOWN CHESTNUT HILL DEDHAM
## 1 1 1 1
## DORCHESTER EAST BOSTON HYDE PARK JAMAICA PLAIN
## 1 1 1 1
## MATTAPAN NEWTON READVILLE ROSLINDALE
## 1 1 1 1
## ROXBURY ROXBURY CROSSING SOUTH BOSTON WEST ROXBURY
## 1 1 1 1
The output of table() shows that one city value is blank. Hence we
have used the filter() function to remove this particular row
to avoid any misunderstanding.
# Drop the row whose city label is blank before plotting.
V2 <- V2 %>% filter(city != "")

# Bar chart of the average building value per city, ordered by that value,
# with the y axis formatted as dollars.
ggplot(V2, aes(x = reorder(city, average_value), y = average_value, fill = city)) +
  geom_col(show.legend = FALSE) +
  ggtitle("Average Property Value by City") +
  xlab("City") +
  ylab("Average Value") +
  scale_y_continuous(
    labels = scales::dollar_format(prefix = "$"),
    breaks = seq(0, 4000000, by = 500000),
    limits = c(0, 4200000)
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
I have created a bar chart to visualize the average property values by city. Average value refers to the mean, which is calculated by dividing the total building value by the number of buildings. The above chart shows that READVILLE has the highest average building value, followed by BOSTON. On the other hand, DEDHAM has the lowest average property value.
As per the previous summary table ‘V1’, READVILLE has only 2 properties, which may be the reason its average value is so high.
# Number of parcels recorded for each heating type, as a plain data frame.
V3 <- property_extract %>%
  count(heat_type, name = "total") %>%
  as.data.frame()

# Rows whose heating type is the empty string are marked as NA and dropped.
V3$heat_type <- na_if(V3$heat_type, "")
V3 <- na.omit(V3)
# Pie chart of heating types: a single stacked bar wrapped into polar
# coordinates, with each slice labelled by its parcel count.
ggplot(V3, aes(x = "", y = total, fill = heat_type)) +
  geom_col() +
  geom_text(aes(label = total), position = position_stack(vjust = 0.40)) +
  coord_polar(theta = "y", start = 0) +
  ggtitle("Distribution of Heat Types") +
  theme_void()
The pie chart above provides a visual representation of the counts of each heating type. It helps us compare the different heating types and see which is used the most. As per the above graph, Ht Water/Steam is the most used heating system, followed by Forced Hot Air. This shows that more than 50% of the parcels use Ht Water/Steam as their heating system.
# Cross-tabulate city against roof structure. CrossTable() prints the table;
# the value it returns is kept in V4 for reshaping further down.
V4 <- gmodels::CrossTable(
  x = property_extract$city,
  y = property_extract$roof_structure
)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 180627
##
##
## | property_extract$roof_structure
## property_extract$city | | F - Flat | G - Gable | H - Hip | L - Gambrel | M - Mansard | O - Other | S - Shed | Row Total |
## ----------------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|
## | 3 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 5 |
## | 3.935 | 1.864 | 0.395 | 0.387 | 0.060 | 0.374 | 0.009 | 0.010 | |
## | 0.600 | 0.000 | 0.400 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
## | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | |
## | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | |
## ----------------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|
## ALLSTON | 892 | 2215 | 824 | 279 | 66 | 143 | 0 | 4 | 4423 |
## | 0.000 | 194.156 | 86.887 | 11.847 | 3.331 | 106.891 | 8.399 | 2.399 | |
## | 0.202 | 0.501 | 0.186 | 0.063 | 0.015 | 0.032 | 0.000 | 0.001 | 0.024 |
## | 0.024 | 0.033 | 0.018 | 0.020 | 0.031 | 0.011 | 0.000 | 0.011 | |
## | 0.005 | 0.012 | 0.005 | 0.002 | 0.000 | 0.001 | 0.000 | 0.000 | |
## ----------------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|
## BOSTON | 12233 | 24091 | 2385 | 380 | 66 | 7611 | 321 | 17 | 47104 |
## | 787.804 | 2426.356 | 7824.122 | 2929.447 | 437.476 | 4730.666 | 599.417 | 59.936 | |
## | 0.260 | 0.511 | 0.051 | 0.008 | 0.001 | 0.162 | 0.007 | 0.000 | 0.261 |
## | 0.336 | 0.358 | 0.051 | 0.027 | 0.031 | 0.563 | 0.936 | 0.049 | |
## | 0.068 | 0.133 | 0.013 | 0.002 | 0.000 | 0.042 | 0.002 | 0.000 | |
## ----------------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|
## BRIGHTON | 1476 | 5533 | 2656 | 1778 | 185 | 203 | 1 | 38 | 11870 |
## | 351.618 | 276.982 | 52.220 | 800.850 | 13.336 | 529.043 | 20.585 | 10.011 | |
## | 0.124 | 0.466 | 0.224 | 0.150 | 0.016 | 0.017 | 0.000 | 0.003 | 0.066 |
## | 0.041 | 0.082 | 0.057 | 0.127 | 0.086 | 0.015 | 0.003 | 0.109 | |
## | 0.008 | 0.031 | 0.015 | 0.010 | 0.001 | 0.001 | 0.000 | 0.000 | |
## ----------------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|
## BROOKLINE | 7 | 2 | 10 | 3 | 2 | 0 | 0 | 0 | 24 |
## | 0.965 | 5.396 | 2.365 | 0.699 | 10.262 | 1.797 | 0.046 | 0.046 | |
## | 0.292 | 0.083 | 0.417 | 0.125 | 0.083 | 0.000 | 0.000 | 0.000 | 0.000 |
## | 0.000 | 0.000 | 0.000 | 0.000 | 0.001 | 0.000 | 0.000 | 0.000 | |
## | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | |
## ----------------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|
## CHARLESTOWN | 1236 | 3706 | 1526 | 165 | 28 | 552 | 4 | 13 | 7230 |
## | 33.744 | 378.594 | 60.325 | 278.818 | 39.312 | 0.212 | 6.895 | 0.062 | |
## | 0.171 | 0.513 | 0.211 | 0.023 | 0.004 | 0.076 | 0.001 | 0.002 | 0.040 |
## | 0.034 | 0.055 | 0.033 | 0.012 | 0.013 | 0.041 | 0.012 | 0.037 | |
## | 0.007 | 0.021 | 0.008 | 0.001 | 0.000 | 0.003 | 0.000 | 0.000 | |
## ----------------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|
## CHESTNUT HILL | 41 | 380 | 177 | 417 | 1 | 0 | 0 | 1 | 1017 |
## | 131.256 | 0.002 | 27.459 | 1451.435 | 10.210 | 76.140 | 1.931 | 0.470 | |
## | 0.040 | 0.374 | 0.174 | 0.410 | 0.001 | 0.000 | 0.000 | 0.001 | 0.006 |
## | 0.001 | 0.006 | 0.004 | 0.030 | 0.000 | 0.000 | 0.000 | 0.003 | |
## | 0.000 | 0.002 | 0.001 | 0.002 | 0.000 | 0.000 | 0.000 | 0.000 | |
## ----------------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|
## DEDHAM | 2 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 6 |
## | 0.516 | 2.237 | 3.904 | 0.465 | 0.072 | 0.449 | 0.011 | 0.012 | |
## | 0.333 | 0.000 | 0.667 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
## | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | |
## | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | |
## ----------------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|
## DORCHESTER | 5894 | 8689 | 9942 | 3435 | 428 | 785 | 6 | 33 | 29212 |
## | 0.003 | 445.536 | 780.491 | 606.309 | 18.208 | 898.780 | 44.121 | 9.630 | |
## | 0.202 | 0.297 | 0.340 | 0.118 | 0.015 | 0.027 | 0.000 | 0.001 | 0.162 |
## | 0.162 | 0.129 | 0.214 | 0.245 | 0.199 | 0.058 | 0.017 | 0.095 | |
## | 0.033 | 0.048 | 0.055 | 0.019 | 0.002 | 0.004 | 0.000 | 0.000 | |
## ----------------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|
## EAST BOSTON | 1918 | 5550 | 1601 | 359 | 43 | 451 | 1 | 28 | 9951 |
## | 3.897 | 912.192 | 360.150 | 220.208 | 48.248 | 116.022 | 16.949 | 4.065 | |
## | 0.193 | 0.558 | 0.161 | 0.036 | 0.004 | 0.045 | 0.000 | 0.003 | 0.055 |
## | 0.053 | 0.082 | 0.034 | 0.026 | 0.020 | 0.033 | 0.003 | 0.080 | |
## | 0.011 | 0.031 | 0.009 | 0.002 | 0.000 | 0.002 | 0.000 | 0.000 | |
## ----------------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|
## HYDE PARK | 1580 | 441 | 5823 | 900 | 173 | 218 | 1 | 71 | 9207 |
## | 41.158 | 2607.545 | 5031.072 | 48.802 | 36.385 | 322.246 | 15.541 | 159.924 | |
## | 0.172 | 0.048 | 0.632 | 0.098 | 0.019 | 0.024 | 0.000 | 0.008 | 0.051 |
## | 0.043 | 0.007 | 0.125 | 0.064 | 0.080 | 0.016 | 0.003 | 0.204 | |
## | 0.009 | 0.002 | 0.032 | 0.005 | 0.001 | 0.001 | 0.000 | 0.000 | |
## ----------------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|
## JAMAICA PLAIN | 1581 | 3810 | 4390 | 1611 | 284 | 409 | 4 | 15 | 12104 |
## | 302.724 | 109.525 | 521.202 | 483.089 | 135.127 | 272.788 | 15.681 | 2.968 | |
## | 0.131 | 0.315 | 0.363 | 0.133 | 0.023 | 0.034 | 0.000 | 0.001 | 0.067 |
## | 0.043 | 0.057 | 0.094 | 0.115 | 0.132 | 0.030 | 0.012 | 0.043 | |
## | 0.009 | 0.021 | 0.024 | 0.009 | 0.002 | 0.002 | 0.000 | 0.000 | |
## ----------------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|
## MATTAPAN | 1022 | 597 | 2108 | 992 | 93 | 19 | 0 | 16 | 4847 |
## | 2.044 | 810.449 | 593.257 | 1011.740 | 21.435 | 325.875 | 9.204 | 4.752 | |
## | 0.211 | 0.123 | 0.435 | 0.205 | 0.019 | 0.004 | 0.000 | 0.003 | 0.027 |
## | 0.028 | 0.009 | 0.045 | 0.071 | 0.043 | 0.001 | 0.000 | 0.046 | |
## | 0.006 | 0.003 | 0.012 | 0.005 | 0.001 | 0.000 | 0.000 | 0.000 | |
## ----------------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|
## NEWTON | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
## | 0.202 | 0.373 | 2.142 | 0.077 | 0.012 | 0.075 | 0.002 | 0.002 | |
## | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
## | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | |
## | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | |
## ----------------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|
## READVILLE | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
## | 6.322 | 0.746 | 0.515 | 0.155 | 0.024 | 0.150 | 0.004 | 0.004 | |
## | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
## | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | |
## | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | |
## ----------------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|
## ROSLINDALE | 1094 | 1022 | 5200 | 1526 | 295 | 50 | 1 | 22 | 9210 |
## | 313.514 | 1694.169 | 3376.476 | 924.726 | 312.188 | 593.151 | 15.546 | 1.021 | |
## | 0.119 | 0.111 | 0.565 | 0.166 | 0.032 | 0.005 | 0.000 | 0.002 | 0.051 |
## | 0.030 | 0.015 | 0.112 | 0.109 | 0.137 | 0.004 | 0.003 | 0.063 | |
## | 0.006 | 0.006 | 0.029 | 0.008 | 0.002 | 0.000 | 0.000 | 0.000 | |
## ----------------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|
## ROXBURY | 2283 | 1230 | 1642 | 256 | 38 | 715 | 0 | 4 | 6168 |
## | 868.588 | 497.626 | 1.857 | 103.056 | 17.186 | 138.855 | 11.713 | 5.230 | |
## | 0.370 | 0.199 | 0.266 | 0.042 | 0.006 | 0.116 | 0.000 | 0.001 | 0.034 |
## | 0.063 | 0.018 | 0.035 | 0.018 | 0.018 | 0.053 | 0.000 | 0.011 | |
## | 0.013 | 0.007 | 0.009 | 0.001 | 0.000 | 0.004 | 0.000 | 0.000 | |
## ----------------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|
## ROXBURY CROSSING | 612 | 592 | 375 | 106 | 9 | 135 | 0 | 3 | 1832 |
## | 159.346 | 12.143 | 19.777 | 9.106 | 7.554 | 0.034 | 3.479 | 0.079 | |
## | 0.334 | 0.323 | 0.205 | 0.058 | 0.005 | 0.074 | 0.000 | 0.002 | 0.010 |
## | 0.017 | 0.009 | 0.008 | 0.008 | 0.004 | 0.010 | 0.000 | 0.009 | |
## | 0.003 | 0.003 | 0.002 | 0.001 | 0.000 | 0.001 | 0.000 | 0.000 | |
## ----------------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|
## SOUTH BOSTON | 2629 | 8794 | 1671 | 143 | 41 | 2141 | 4 | 16 | 15439 |
## | 75.245 | 1602.733 | 1334.739 | 927.394 | 111.242 | 839.607 | 21.863 | 6.352 | |
## | 0.170 | 0.570 | 0.108 | 0.009 | 0.003 | 0.139 | 0.000 | 0.001 | 0.085 |
## | 0.072 | 0.131 | 0.036 | 0.010 | 0.019 | 0.158 | 0.012 | 0.046 | |
## | 0.015 | 0.049 | 0.009 | 0.001 | 0.000 | 0.012 | 0.000 | 0.000 | |
## ----------------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|
## WEST ROXBURY | 1915 | 696 | 6158 | 1646 | 402 | 91 | 0 | 67 | 10975 |
## | 40.103 | 2818.481 | 3932.114 | 744.318 | 561.644 | 649.744 | 20.841 | 99.444 | |
## | 0.174 | 0.063 | 0.561 | 0.150 | 0.037 | 0.008 | 0.000 | 0.006 | 0.061 |
## | 0.053 | 0.010 | 0.132 | 0.118 | 0.187 | 0.007 | 0.000 | 0.193 | |
## | 0.011 | 0.004 | 0.034 | 0.009 | 0.002 | 0.001 | 0.000 | 0.000 | |
## ----------------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|
## Column Total | 36420 | 67348 | 46495 | 13996 | 2154 | 13523 | 343 | 348 | 180627 |
## | 0.202 | 0.373 | 0.257 | 0.077 | 0.012 | 0.075 | 0.002 | 0.002 | |
## ----------------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|
##
##
# Reshape the count table of the cross-tabulation into long format: one row
# per (city, roof structure) pair, with the parcel count in `Count`.
# Only the `t` element (the raw counts) is melted. V4 is a list that also
# holds the row, column and table proportion matrices; melting the whole list
# stacks those proportions into `Count` as well, which inflates the bars of a
# chart that sums `Count` per city.
V4_long <- melt(V4$t, varnames = c("x", "y"), value.name = "Count")

# Drop the blank city (`x`) and blank roof-structure (`y`) categories.
V4_long <- filter(V4_long, x != "", y != "")
We have reshaped the data into a long format using
melt() so that it can be plotted as a stacked bar graph.
# Stacked bar chart: one bar per city, split by roof structure.
ggplot(V4_long, aes(x = x, y = Count, fill = y)) +
  geom_col() +
  ggtitle("Roof Structure Counts by City") +
  xlab("City") +
  ylab("Count") +
  scale_y_continuous(
    breaks = seq(0, 35000, by = 3000),
    limits = c(0, 45000)
  ) +
  theme_minimal() +
  # Tilt the city labels so they do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
I have used a stacked bar graph to plot the roof structure counts by city. From the above we can conclude that BOSTON uses the F - Flat roof type the most, followed by SOUTH BOSTON and DORCHESTER. For cities like BROOKLINE, DEDHAM, NEWTON and READVILLE the data is very limited, which is why their bars are not visible above.
# Land area and land value only, without rows that have a missing value in
# either column or a land value of zero.
V5 <- property_extract %>%
  select(land_sf, land_value) %>%
  na.omit() %>%
  filter(land_value != 0)
Used the filter() and na.omit() functions to
exclude rows with values that are 0 or missing (NA).
# Correlation between land area and land value, computed on all rows of V5
# (not only on the points that fall inside the plot window below).
V5_correlation <- cor(V5$land_sf, V5$land_value)

# Scatter plot of land area against land value, annotated with the
# correlation coefficient. The axis limits restrict the view to
# 0-8000 sq ft and $0-$300,000; points outside these limits are dropped from
# the plot (ggplot2 reports them as removed rows).
ggplot(V5, aes(x = land_sf, y = land_value)) +
  geom_point(size = 0.0025) +
  labs(
    title = "Relationship between Land Area in sq_feet and Land Value",
    x = "Land Area in sq",
    y = "Land Value in $"
  ) +
  scale_y_continuous(
    breaks = seq(0, 270000, by = 45000),
    limits = c(0, 300000)
  ) +
  scale_x_continuous(
    breaks = seq(0, 10000, by = 2000),
    limits = c(0, 8000)
  ) +
  # annotate() draws the label once; geom_text() with a constant label would
  # redraw the same text for every row of V5.
  annotate(
    "text",
    x = 7000, y = 15000,
    label = paste("CORRELATION: ", round(V5_correlation, 3))
  )
## Warning: Removed 28371 rows containing missing values (`geom_point()`).
The above scatter plot shows the relationship between land_value and land_sf. The points are concentrated mainly around a land area of 4,000 sq ft, with land values ranging from $180,000 to $225,000. The points show an upward trend, which indicates that land value increases with land area. Other variables, for instance the area and amenities, are also involved in determining the land value, which is why the points are scattered widely. The correlation between land value and land area is 0.413.
# Bedroom counts only, with missing values removed.
V6 <- na.omit(property_extract["bed_rms"])

# Histogram with one bin per bedroom count.
V6_plot <- ggplot(V6, aes(x = bed_rms)) +
  geom_histogram(binwidth = 1, fill = "cyan", color = "white") +
  ggtitle("Distribution of Number of Bedrooms") +
  xlab("Number of Bedrooms in a parcel") +
  ylab("Count")

# Render the histogram as an interactive plotly widget.
ggplotly(V6_plot)
The above histogram shows the distribution of the number of bedrooms. As per the above graph, we can observe that most parcels have 2 bedrooms, followed by parcels with 3 bedrooms.